
library(doParallel)
library(tidyverse)
library(data.table)
library(tidytext)
library(dplyr)
library(stm)
library(haven)

# Remove every object that ls() reports in the current workspace so the
# script does not pick up leftover variables. Note that this does not detach
# packages or reset options, so it is not equivalent to a fresh R session.
rm(list = ls())

# Make "~/comments" the working directory; every relative file path used
# after this point is resolved against it.
setwd("~/comments")

# Read the scraped Kickstarter comments and reduce them to the rows and
# columns needed for topic modelling.
# Result: `data`, a plain data.frame with columns author, text, date, url.
data <- fread("kickstarter_comments.csv", sep = ",", encoding = "UTF-8",
              header = TRUE)

# Replace the header read from the file with fixed column names. setnames()
# stops with an error if the file does not have exactly these nine columns.
setnames(data, c("author", "text", "time", "comment_url", "reply",
                 "author_url", "badges", "canceled", "url"))

# Keep only the part of `time` before the first space and parse it as
# day-month-year. dmy() is called through its namespace because lubridate is
# only attached by library(tidyverse) from tidyverse 2.0.0 onwards.
data[, date := lubridate::dmy(sub(" .*", "", time))]

# Strip the "?ref=kicktraq" referral suffix from the url values.
data[, url := gsub("\\?ref=kicktraq", "", url)]

# select ordinary backers and superbackers (empty badge or "superbacker"),
# and keep only the columns used downstream
data <- data[badges %in% c("", "superbacker"), .(author, text, date, url)]

# Convert the data.table to a plain data.frame for the steps that follow.
data <- as.data.frame(data)

# Pre-process the comment text (English settings, HTML stripped) and carry
# the whole data frame along as document-level metadata.
processed <- textProcessor(
  documents = data$text,
  metadata = data,
  language = "english",
  striphtml = TRUE
)

# Build the stm inputs, using document-frequency thresholds of 5 (lower) and
# 5000 (upper) to drop very rare and very common terms.
out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta,
  lower.thresh = 5,
  upper.thresh = 5000
)


# Candidate numbers of topics: steps of 5 up to 30, then a coarser grid.
K <- c(seq(5, 30, by = 5), 40, 50, 60, 80, 100, 150)

# Fit one model per candidate K (at most 25 EM iterations each), with topic
# prevalence depending on comment date and project url, and print the
# resulting diagnostics.
kresult <- searchK(
  documents = out$documents,
  vocab = out$vocab,
  K = K,
  prevalence = ~ date + url,
  data = out$meta,
  max.em.its = 25
)
print(kresult)
# --> 50 topics according to heldout and residual metrics

# Fit the final 50-topic structural topic model with spectral initialisation,
# using the same prevalence covariates (date and url) as the K search and at
# most 50 EM iterations.
model <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 50,
  prevalence = ~ date + url,
  data = out$meta,
  init.type = "Spectral",
  max.em.its = 50
)

# Combine each document's author, date and url with its row of model$theta
# (one column per topic) and write the table out as a Stata 14 file.
proportions <- data.frame(
  author = out$meta$author,
  date = out$meta$date,
  url = out$meta$url,
  as.data.frame(model$theta),
  check.names = FALSE
)
write_dta(data = proportions, path = "topic proportions.dta", version = 14)

